# read in the data and rename the column name to align with cloud data in the data dictionaries
image1<- as.data.frame(read.table("image_data/image1.txt"))
image1$picture<- rep("image1", nrow(image1))
colnames(image1) [1:11] <- c("y_coordinate", "x_coordinate", "label", 
                             "NDAI", "SD", "CORR", "DF_angle", "CF_angle" , 
                             "BF_angle", "AF_angle", "AN_angle")

image2<- as.data.frame(read.table("image_data/image2.txt"))
image2$picture<- rep("image2", nrow(image2))
colnames(image2) [1:11] <- c("y_coordinate", "x_coordinate", "label", 
                             "NDAI", "SD", "CORR", "DF_angle", 
                             "CF_angle" , "BF_angle", "AF_angle", "AN_angle")

image3<- as.data.frame(read.table("image_data/image3.txt"))
image3$picture<- rep("image3", nrow(image3))
colnames(image3) [1:11] <- c("y_coordinate", "x_coordinate", "label",
                             "NDAI", "SD", "CORR", "DF_angle", 
                             "CF_angle" , "BF_angle", "AF_angle", "AN_angle")

Problem 1b Summarize Data

total_df <- rbind(image1, image2, image3)

ggplot(image1)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, 
                                            color = factor(label)))+
  xlab(" x coordinate") +
  ylab(" y coordinate") +
  ggtitle("Coordinate Map image1 with label ")+ 
  theme_bw()

ggplot(image2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
                                            color = factor(label)))+
  xlab(" x coordinate") +
  ylab(" y coordinate") + 
  ggtitle("Coordinate Map image2 with label ")+ 
  theme_bw()

ggplot(image3)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, 
                                            color = factor(label)))+
  xlab(" x coordinate") + 
  ylab(" y coordinate") +
  ggtitle("Coordinate Map image3 with label ")+ 
  theme_bw()

#Check if there's any missing data
sum(is.na(total_df))
## [1] 0
#There is no missing value 
summary(total_df)
##   y_coordinate    x_coordinate       label              NDAI        
##  Min.   :  2.0   Min.   : 65.0   Min.   :-1.0000   Min.   :-1.8420  
##  1st Qu.: 98.0   1st Qu.:143.0   1st Qu.:-1.0000   1st Qu.:-0.4286  
##  Median :193.0   Median :218.0   Median : 0.0000   Median : 1.3476  
##  Mean   :193.1   Mean   :218.1   Mean   :-0.1334   Mean   : 1.0847  
##  3rd Qu.:289.0   3rd Qu.:294.0   3rd Qu.: 0.0000   3rd Qu.: 2.3142  
##  Max.   :383.0   Max.   :369.0   Max.   : 1.0000   Max.   : 4.5639  
##        SD                CORR            DF_angle         CF_angle     
##  Min.   :  0.1987   Min.   :-0.3872   Min.   : 45.28   Min.   : 31.19  
##  1st Qu.:  1.6376   1st Qu.: 0.1253   1st Qu.:244.56   1st Qu.:219.27  
##  Median :  4.3095   Median : 0.1603   Median :281.91   Median :259.31  
##  Mean   :  8.0633   Mean   : 0.1860   Mean   :271.36   Mean   :246.37  
##  3rd Qu.: 10.2264   3rd Qu.: 0.2231   3rd Qu.:300.39   3rd Qu.:279.59  
##  Max.   :117.5810   Max.   : 0.8144   Max.   :410.53   Max.   :360.68  
##     BF_angle         AF_angle         AN_angle        picture         
##  Min.   : 24.49   Min.   : 21.07   Min.   : 20.57   Length:345556     
##  1st Qu.:200.79   1st Qu.:185.16   1st Qu.:174.88   Class :character  
##  Median :236.17   Median :211.54   Median :197.58   Mode  :character  
##  Mean   :224.20   Mean   :201.71   Mean   :188.29                     
##  3rd Qu.:258.62   3rd Qu.:235.15   3rd Qu.:216.80                     
##  Max.   :335.08   Max.   :318.70   Max.   :306.93
#Calculate the proportion 
percentage_label<- total_df %>%
  group_by(picture, label) %>%
  summarise (n = n()) %>%
  mutate(freq = n / sum(n))
percentage_label
g1 <- ggplot(data=percentage_label[percentage_label$picture=='image1',], 
  aes(x=label, y=freq)) +
  geom_bar(stat="identity", color="black", fill="lightskyblue3")+
  ylab("Proportion")+
  ylim(0,0.6)
g2 <- ggplot(data=percentage_label[percentage_label$picture=='image2',], 
  aes(x=label, y=freq)) +
  geom_bar(stat="identity", color="black", fill="lightskyblue3")+
  ylab(NULL)+
  ylim(0,0.6)
g3 <- ggplot(data=percentage_label[percentage_label$picture=='image3',], 
  aes(x=label, y=freq)) +
  geom_bar(stat="identity", color="black", fill="lightskyblue3")+
  ylab(NULL)+
  ylim(0,0.6)
grid.arrange(g1,g2,g3,ncol=3,top = textGrob("Proportion of label in each image",
                                            gp = gpar(fontsize = 16))) 

Problem 1c EDA

labels<- factor(total_df$label)
cor(total_df[,c(-12)])
##              y_coordinate x_coordinate        label       NDAI         SD
## y_coordinate  1.000000000 -0.006376002 -0.213372321 -0.3276781 -0.2646997
## x_coordinate -0.006376002  1.000000000 -0.465822566 -0.5231960 -0.3233889
## label        -0.213372321 -0.465822566  1.000000000  0.6169346  0.2954477
## NDAI         -0.327678096 -0.523195958  0.616934624  1.0000000  0.6310626
## SD           -0.264699672 -0.323388894  0.295447745  0.6310626  1.0000000
## CORR         -0.254309102 -0.361793127  0.444059231  0.4034998  0.2968385
## DF_angle      0.378983970  0.081274648  0.006550085 -0.1610916 -0.2061691
## CF_angle      0.472777594  0.269869879 -0.208279170 -0.3622113 -0.3688601
## BF_angle      0.520812966  0.339539816 -0.337948500 -0.4629301 -0.4404404
## AF_angle      0.530441793  0.368160603 -0.389741017 -0.4927484 -0.4555423
## AN_angle      0.515677887  0.383211616 -0.389358825 -0.4895267 -0.4466229
##                    CORR     DF_angle   CF_angle   BF_angle   AF_angle
## y_coordinate -0.2543091  0.378983970  0.4727776  0.5208130  0.5304418
## x_coordinate -0.3617931  0.081274648  0.2698699  0.3395398  0.3681606
## label         0.4440592  0.006550085 -0.2082792 -0.3379485 -0.3897410
## NDAI          0.4034998 -0.161091626 -0.3622113 -0.4629301 -0.4927484
## SD            0.2968385 -0.206169130 -0.3688601 -0.4404404 -0.4555423
## CORR          1.0000000  0.126283481 -0.1660966 -0.4311043 -0.6039353
## DF_angle      0.1262835  1.000000000  0.8495716  0.6991073  0.5910958
## CF_angle     -0.1660966  0.849571562  1.0000000  0.9119430  0.8216176
## BF_angle     -0.4311043  0.699107255  0.9119430  1.0000000  0.9529627
## AF_angle     -0.6039353  0.591095794  0.8216176  0.9529627  1.0000000
## AN_angle     -0.6820440  0.547609821  0.7727499  0.9043409  0.9706198
##                AN_angle
## y_coordinate  0.5156779
## x_coordinate  0.3832116
## label        -0.3893588
## NDAI         -0.4895267
## SD           -0.4466229
## CORR         -0.6820440
## DF_angle      0.5476098
## CF_angle      0.7727499
## BF_angle      0.9043409
## AF_angle      0.9706198
## AN_angle      1.0000000
corrplot.mixed(cor(total_df[,c(-3, -12)]))

G<- ggplot(total_df)+geom_density(aes(x= NDAI, group= label, 
                                      color=labels, fill= labels),
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of NDAI based on labels")+
  theme_minimal()
 
H<- ggplot(total_df)+geom_density(aes(x= log(SD), group= label,
                                      color=labels, fill= labels), 
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of Log SD based on labels")+
  theme_minimal()

I<- ggplot(total_df)+geom_density(aes(x= CORR, group= label, 
                                      color=labels, fill= labels), 
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of CORR based on labels")+
  theme_minimal()

A<- ggplot(total_df)+geom_density(aes(x= DF_angle, group= label, 
                                      color=labels, fill= labels), 
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of DF_angle based on labels")+
  theme_minimal()

B<- ggplot(total_df)+geom_density(aes(x= CF_angle, group= label, 
                                      color=labels, fill= labels),
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of CF_angle based on labels")+
  theme_minimal()

C<- ggplot(total_df)+geom_density(aes(x= BF_angle, group= label, 
                                      color=labels, fill= labels), 
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of BF_angle based on labels")+
  theme_minimal()

D<-ggplot(total_df)+geom_density(aes(x= AF_angle, group= label,
                                     color=labels, fill= labels),
                                 color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of AF_angle based on labels")+
  theme_minimal()

E<- ggplot(total_df)+geom_density(aes(x= AN_angle, group= label,
                                      color=labels, fill= labels), 
                                  color= "black",alpha = 0.7)+
  ggtitle("Overlaying histogram of AN_angle based on labels")+
  theme_minimal()

plot_grid(B,C,D,E,nrow=2, align="h")

plot_grid(A,G,H,I,nrow=2, align="h")

Problem 2a Data Split

# Filter all the undefined label
image1<- image1 %>% filter(label !=0)
image2<- image2 %>% filter(label !=0)
image3<- image3 %>% filter(label !=0)

Split method 1 By label

traintest_split<- function(data){
  res<-list()
  trainIndex <- createDataPartition(data$label, p = .8, 
                                  list = FALSE, 
                                  times = 1)
  res$train<- data[ trainIndex,]
  res$test<-  data[-trainIndex,]
  return(res)
}
trainval_split<- function(data){
  res<-list()
  valIndex <- createDataPartition(data$label, p = .2, 
                                  list = FALSE, 
                                  times = 1)
  res$val<- data[ valIndex,]
  res$train<-  data[-valIndex,]
  return(res)
}

##image 1
#Test-train split
train1_label_split1<- traintest_split(image1)$train
# get test from train-test split
test_label_split1<- traintest_split(image1)$test

#Train-val split ---get val
val_label_split1<- trainval_split(train1_label_split1)$val
#Get train
train_label_split1<-trainval_split(train1_label_split1)$train

##image 2
train2_label_split2<- traintest_split(image2)$train
# get test from train-test split
test_label_split2<- traintest_split(image2)$test
#Train-val split ---get val
val_label_split2<- trainval_split(train2_label_split2)$val
#Get train
train_label_split2<-trainval_split(train2_label_split2)$train

##image 3
train3_label_split3<- traintest_split(image3)$train
# get test from train-test split
test_label_split3<- traintest_split(image3)$test

#Train-val split ---get val
val_label_split3<- trainval_split(train3_label_split3)$val
#Get train
train_label_split3<-trainval_split(train3_label_split3)$train

method1_train<-rbind(train_label_split1,train_label_split2,train_label_split3)
method1_val<-rbind(val_label_split1,val_label_split2,val_label_split3)
method1_test<- rbind(test_label_split1,test_label_split2,test_label_split3)
method1_train_val<-rbind(method1_train,method1_val)
method1_train_val$label<-factor(method1_train_val$label)

Split method 2 By block

#Check Duplicates
image1 %>% distinct()
grid_row <- 10
grid_col <- 10
datasplit<- function(grid_row, grid_col, image){
  ret <- list()
  train_data <- data.frame()
  val_data <- data.frame()
  test_data <- data.frame()
  min_y_cor <- min(image$y_coordinate)
  min_x_cor <- min(image$x_coordinate)
  max_y_cor <- max(image$y_coordinate)
  max_x_cor <- max(image$x_coordinate)
  divide_row <- seq(min_y_cor,max_y_cor+1,(max_y_cor+1-min_y_cor)/(grid_row-1))
  divide_col <- seq(min_x_cor,max_x_cor+1,(max_x_cor+1-min_x_cor)/(grid_col-1))
  for (i in 1:length(divide_row)){
    for (j in 1:length(divide_col)){
      chunk <- image[image$y_coordinate>=divide_row[i] & image$y_coordinate<divide_row[i+1] & 
                       image$x_coordinate>=divide_col[j] & image$x_coordinate<divide_col[j+1], ]
      test_and_val <- floor(nrow(chunk)*0.4)
      test_and_val_ind <- sample(seq_len(nrow(chunk)), size = test_and_val)
      val_size <- floor(test_and_val/2)
      val_ind_ind <- sample(length(test_and_val_ind), size = val_size)
      val_ind <- test_and_val_ind[val_ind_ind]
      test_ind <- test_and_val_ind[-val_ind_ind]
      test<- chunk[test_ind, ]
      val<- chunk[val_ind, ]
      train<- chunk[-test_and_val_ind, ]
      train_data = rbind(train_data,train)
      test_data = rbind(test_data,test)
      val_data = rbind(val_data,val)
    }
  }
  ret$training <- train_data
  ret$validation <- val_data
  ret$test <- test_data
  return(ret)
}

split_result_1 <- datasplit(10,10,image1)
split_result_2 <- datasplit(10,10,image2)
split_result_3 <- datasplit(10,10,image3)
train_total<- rbind(split_result_1$training,split_result_2$training,split_result_3$training)
val_total<- rbind(split_result_1$validation,split_result_2$validation,split_result_3$validation)
test_total<- rbind(split_result_1$test,split_result_2$test,split_result_3$test)
train_total$picture<- NULL
val_total$picture<- NULL
test_total$picture<-NULL
train_total$label<-factor(train_total$label)

Problem 2b Baseline

test_trivial<- test_total
test_trivial$label<--1
train_trivial<- train_total
train_trivial$label<--1
val_trivial<- val_total
val_trivial$label<--1
sum(test_trivial$label ==test_total$label)/length(test_total$label)
## [1] 0.6095732
sum(val_trivial$label ==val_total$label)/length(val_trivial$label)
## [1] 0.6107271
sum(train_trivial$label ==train_total$label)/length(train_trivial$label)
## [1] 0.6112134

Problem 2c First Order Importance

Visualization

train_val<- rbind(train_total, val_total)
train_val$label<- as.factor(train_val$label)
test_total$label<- as.factor(test_total$label)
#visualization by using boxplot
A<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= y_coordinate,
                                      color= factor(label)))+ theme_bw()+ 
  ggtitle("y_coordinate and label")+xlab("label")+
  theme(plot.title = element_text(size = rel(1), 
                                  vjust = 1.5,face="bold.italic"))+
  theme(legend.position="none")

B<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= x_coordinate,
                                      color= factor(label)))+ theme_bw()+ 
  ggtitle("x_coordinate and label")+theme(legend.position="none")+
  xlab("label")+theme(plot.title = element_text(size = rel(1), 
                                                vjust = 1.5,face="bold.italic"))

C<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= NDAI,
                                      color= factor(label)))+ theme_bw()+ 
  ggtitle("label and NDAI")+theme(legend.position="none")+
  xlab("label")+theme(plot.title = element_text(size = rel(1),
                                                vjust = 1.5,face="bold.italic"))

D<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= SD,color= factor(label)))+ theme_bw()+ 
  ggtitle("label and SD")+theme(legend.position="none")+
  xlab("label")+theme(plot.title = element_text(size = rel(1),
                                                vjust = 1.5,face="bold.italic"))

E<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CORR,color= factor(label)))+ theme_bw()+ 
  ggtitle("label and CORR")+theme(legend.position="none")+
  xlab("label")+theme(plot.title = element_text(size = rel(1),
                                                vjust = 1.5,face="bold.italic"))

G<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= DF_angle,
                                      color= factor(label)))+ theme_bw()+ 
  theme(legend.position="none")+
  ggtitle("label and DF_angle")+
  xlab("label")+theme(plot.title = element_text(size = rel(1), 
                                                vjust = 1.5,face="bold.italic"))

H<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= CF_angle,
                                      color= factor(label)))+ theme_bw()+
  theme(legend.position="none")+
  ggtitle("label and CF_angle")+
  xlab("label")+theme(plot.title = element_text(size = rel(1), 
                                                vjust = 1.5,face="bold.italic"))

I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= BF_angle,
                                      color= factor(label)))+ theme_bw()+
  theme(legend.position="none")+
  ggtitle("label and BF_angle")+
  xlab("label")+theme(plot.title = element_text(size = rel(1), 
                                                vjust = 1.5,face="bold.italic"))

J<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AF_angle,
                                      color= factor(label)))+ theme_bw()+ 
  theme(legend.position="none")+
  ggtitle("label and AF_angle")+
  xlab("label")+theme(plot.title = element_text(size = rel(1), 
                                                vjust = 1.5,face="bold.italic"))

K<-I<-ggplot(train_val)+geom_boxplot(aes(x=factor(label), y= AN_angle,
                                         color= factor(label)))+ theme_bw()+ 
  ggtitle("label and AN_angle")+theme(legend.position="none")+
  xlab("label")+theme(plot.title = element_text(size = rel(1),
                                                vjust = 1.5,face="bold.italic"))
 
plot_grid(A, B,C,D,E,G,H,I,J,K,nrow=3, align="h")

Quantitative

summary(lm(as.numeric(label)~NDAI, data = train_val))$r.squared
## [1] 0.5744323
summary(lm(as.numeric(label)~CORR, data = train_val))$r.squared
## [1] 0.3035037
summary(lm(as.numeric(label)~AF_angle, data = train_val))$r.squared
## [1] 0.257956
summary(lm(as.numeric(label)~x_coordinate, data = train_val))$r.squared
## [1] 0.3264572
summary(lm(as.numeric(label)~y_coordinate, data = train_val))$r.squared
## [1] 0.07865887
summary(lm(as.numeric(label)~SD, data = train_val))$r.squared
## [1] 0.1908709
summary(lm(as.numeric(label)~DF_angle, data = train_val))$r.squared
## [1] 0.0001069918
summary(lm(as.numeric(label)~CF_angle, data = train_val))$r.squared
## [1] 0.08052235
summary(lm(as.numeric(label)~AN_angle, data = train_val))$r.squared
## [1] 0.2553198

Problem 3a

Model 1: Logistic Regression

# Compute CV loss
seed = 123
K = 5
source("CVgeneric.R")
cv_result <- CVgeneric("logistic", c("y_coordinate", "x_coordinate", 
                                     "NDAI", "SD", "CORR", "DF_angle", 
                                     "CF_angle", "BF_angle", "AF_angle", 
                                     'AN_angle'), "label",
                       K, data=train_val, loss= accuracy, seed)
set.seed(cv_result$seed)
cv_result_method_2 <- CVgeneric("logistic", c("y_coordinate", "x_coordinate",
                                              "NDAI", "SD", "CORR", "DF_angle", 
                                              "CF_angle", "BF_angle", "AF_angle", 
                                              'AN_angle'), "label", 
                                K, data=method1_train_val, loss= accuracy, seed)
# Use data based on the best folds
train_data = train_val[-cv_result$index, ]
# Train logistic model
logistic_model<- train(label ~ .,  data=train_data, method="glm", family="binomial")
# Test accuracy
predicted.classes <- logistic_model %>% predict(test_total[,-3])
mean(predicted.classes == test_total$label)
## [1] 0.8966023
# Other error metrics
# F1 Score
F1_Score(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9148915
# Sensitivity
Sensitivity(as.numeric(predicted.classes), as.numeric(test_total$label))
## [1] 0.9181057
# Find the best cut off point
predictions<- data.frame(predict(logistic_model, newdata =test_total[,-3],
                                 type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot",
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Logistic regression cutoff and performance tradeoff")

index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]

perf <- performance(pred, "tpr", "fpr")
# ROC curve plot
plot(perf, colorize=T, print.cutoffs.at=c(max),
     text.adj=c(1.5,0.2),
     avg="threshold", lwd=3,
     main= "Roc curve for logistic regression")
abline(a=0,b=1)

#### Model 2: LDA

# Compute CV loss
cv_result_lda<-CVgeneric("lda", c("y_coordinate", "x_coordinate", 
                                  "NDAI", "SD", "CORR", "DF_angle", 
                                  "CF_angle", "BF_angle", "AF_angle", 'AN_angle'),
                         "label",
                         K = 5, train_val, loss = precision, seed)
cv_result_lda_method_2<-CVgeneric("lda", c("y_coordinate", "x_coordinate", 
                                           "NDAI", "SD", "CORR", "DF_angle",
                                           "CF_angle", "BF_angle", "AF_angle", 'AN_angle'),
                        "label",
                        K = 5, method1_train_val, loss = precision, seed)
# Use data based on the best folds
train_data = train_val[-cv_result_lda$index, ]
# Fit LDA Model
lda.model = lda(factor(label)~., data=train_val)
lda_pred<-predict(lda.model, newdata=test_total[,-3])
lda_pre2<-predict(lda.model, newdata=test_total[,-3], type=  "prob")
  
#Other error metrics
#F1 Score
F1_Score(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9165936
#Sensitivity
Sensitivity(as.numeric(lda_pred$class), as.numeric(test_total$label))
## [1] 0.9271657
#Accuracy of the model
mean(lda_pred$class == test_total$label)
## [1] 0.8994617
# Find the best cut off point
predictions<- data.frame(predict(lda.model, newdata =test_total[,-3] ))
colnames(predictions) <- c("label",-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", show.spread.at= seq(0.1, 0.9, by=0.1), main="LDA cutoff and performance tradeoff")

index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
# ROC Curve
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(1,0), avg="threshold", lwd=3, main= "LDA curve for ROC")
abline(a=0,b=1)

Model 3: Decision Tree

# Delete unused column
method1_train_val$picture<-NULL
# Compute CV loss for first method split
seed = 123
K = 5
cv_result_decision_tree <- CVgeneric("decision tree", 
                                     c("y_coordinate",  "x_coordinate",  "NDAI","SD","CORR","DF_angle","CF_angle","BF_angle","AF_angle",'AN_angle'),"label",K,data=train_val,loss= accuracy, seed)
# Use data based on the best folds for the first method split
train_data = train_val[-cv_result_decision_tree$index, ]

# Train the decision tree model
tree = rpart(label ~ ., data=train_data,maxdepth =10, minsplit= 10)
# Visualize the decision tree
fancyRpartPlot(tree)

# Train the decision tree model without the geographic features
tree_no_xy = rpart(label ~ NDAI+SD+CORR+DF_angle+CF_angle+BF_angle+AF_angle+AN_angle , data=train_data,maxdepth =10, minsplit= 10)
# Visualize the decision tree model without the geographic features
fancyRpartPlot(tree_no_xy)

# Accuracy for decision tree model without the geographic features
tree.pred_no_xy = predict(tree_no_xy, newdata=test_total[,-3],type = 'class')
mean(tree.pred_no_xy==test_total$label)
## [1] 0.9050846
# Accuracy for  decision tree model WITH the geographic features
tree.pred = predict(tree, newdata=test_total[,-3],method = 'response')
tree.pred2 = predict(tree, newdata=test_total[,-3],type = 'class')
mean(tree.pred2==test_total$label)
## [1] 0.9222174
#Other error metrics
#F1 Score
F1_Score(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9336694
#Sensitivity
Sensitivity(as.numeric(tree.pred2), as.numeric(test_total$label))
## [1] 0.9722187
# Find the best cut off point
predictions<- data.frame(predict(tree, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", 
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Decision Tree cutoff and performance tradeoff")

index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
# ROC Curve
plot(perf, colorize=T, print.cutoffs.at=max, text.adj=c(1,0), avg="threshold", 
     lwd=3, 
     main= "Roc curve for Decision Tree")
abline(a=0,b=1)

# Decision Tree feature importance (4a preparation)
tree_importance<-as.data.frame(varImp(tree))
tree_importance$importance<-rownames(tree_importance)
tree_importance <- tree_importance %>% arrange(desc(Overall))

# Compute CV loss for second method data split
cv_result_decision_tree_method2 <- CVgeneric("decision tree", 
                                             c("y_coordinate", "x_coordinate", 
                                               "NDAI", "SD", "CORR", "DF_angle", 
                                               "CF_angle", "BF_angle", "AF_angle",
                                               'AN_angle'), "label",K, 
                                             data=method1_train_val, 
                                             loss= accuracy, seed)
train_data_tree_2 = method1_train_val[-cv_result_decision_tree_method2$index, ]
tree2 = rpart(label ~ ., data=train_data_tree_2,maxdepth =10, minsplit= 10)
tree.pred_method_2 <- predict(tree2, newdata=test_total[,-3],type = 'class')

Model 4:Random Forest

seed = 123
K = 5
# Compute CV loss for first method split
cv_result_rf <- CVgeneric("random forest", c("y_coordinate", 
                                             "x_coordinate", 
                                             "NDAI", "SD", "CORR", 
                                             "DF_angle", "CF_angle", 
                                             "BF_angle", "AF_angle", 
                                             'AN_angle'), "label", K, 
                          data=train_val, loss= accuracy, seed)
# Use the best folds on the training set
train_data_rf = train_val[-cv_result_rf$index, ]
# Choose the best hyperparameter of ntry which is the Number of variables randomly sampled as candidates at each split.
Random_forest_hyperparameter_tune<-tuneRF(x =train_data[,-3],
              y = as.factor(train_data[,3]),
              ntreeTry = 50)
## mtry = 3  OOB error = 0.46% 
## Searching left ...
## mtry = 2     OOB error = 0.56% 
## -0.2166124 0.05 
## Searching right ...
## mtry = 6     OOB error = 0.36% 
## 0.2084691 0.05 
## mtry = 10    OOB error = 0.45% 
## -0.2222222 0.05

# Plot the ntry
plot(data.frame(Random_forest_hyperparameter_tune)$mtry,
     data.frame(Random_forest_hyperparameter_tune)$OOBError, 
     type = "l",ylab="OOBError",xlab="mtry", 
     main = "Random Forest mtry tune")

#Hyperparameter in R for random Forest
#ntree: Number of trees to grow.

# Fit the random forest model
rf_model <- randomForest(as.factor(label) ~ y_coordinate + x_coordinate +
                           NDAI + SD + CORR + DF_angle + CF_angle + BF_angle +
                           AF_angle + AN_angle, data = train_data_rf,
                         importance = TRUE,
                         ntree=50,
                         ntry=6,
                         maxnodes= 500, 
                         nodesize = 10)
# Compute the accuracy of the model
predicted.classes_rf <- rf_model %>% predict(test_total[,-3])
mean(predicted.classes_rf == test_total$label)
## [1] 0.9847414
#Other error metrics
#F1 Score
F1_Score(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.9873865
#Sensitivity
Sensitivity(as.numeric(predicted.classes_rf), as.numeric(test_total$label))
## [1] 0.9951552
# Find the best cut off point
predictions<- data.frame(predict(rf_model, newdata =test_total[,-3], type= "prob" ))
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", 
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Random Foret cutoff and performance tradeoff")

index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
perf <- performance(pred, "tpr", "fpr")
# ROC curve
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.5,0), 
     avg="threshold", lwd=3, main= "Roc curve for Random Forest")
abline(a=0,b=1)

# Feature importance for random forest model
varImp(rf_model)
#CV method 2
cv_result_rf_method2 <- CVgeneric("random forest", 
                                  c("y_coordinate", "x_coordinate", "NDAI", "SD", 
                                    "CORR", "DF_angle", "CF_angle", "BF_angle", 
                                    "AF_angle", 'AN_angle'), 
                                  "label", K, data=method1_train_val, 
                                  loss= accuracy, 
                                  seed)
#Print accuracy result
#cv_result_rf_method2
#train data by using the best training sets
train_data_rf2 = method1_train_val[-cv_result_rf_method2$index, ]

rf_model_method_2 <- randomForest(as.factor(label) ~ y_coordinate + 
                                    x_coordinate + NDAI + SD + CORR +
                                    DF_angle + CF_angle + BF_angle +AF_angle + 
                                    AN_angle,
                                  data = train_data_rf2, 
                                  importance = TRUE,ntree=50,
                                  ntry=6,maxnodes= 500, nodesize = 10)

predicted.classes_rf_method_2 <- rf_model_method_2 %>% predict(test_total[,-3])

Problem 4 Diagnostics

4a

# Decision Tree feature importance
ggplot(data=tree_importance, aes(x=reorder(importance, -Overall), y=Overall)) +
  geom_bar(stat="identity", color="black", fill="lightgreen")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  xlab("features")+
  ylab("importance")+
  ggtitle("Feature importance in Decision Tree")

# Balance the cost complexity and error
fit <- rpart(label~.,
   method="anova", data=train_val)
printcp(fit) # display the results 
## 
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## 
## Variables actually used in tree construction:
## [1] AN_angle     CORR         NDAI         x_coordinate
## 
## Root node error: 39557/166443 = 0.23766
## 
## n= 166443 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.643321      0   1.00000 1.00003 0.0011172
## 2 0.035724      1   0.35668 0.35749 0.0021904
## 3 0.020565      2   0.32095 0.32146 0.0021557
## 4 0.020420      3   0.30039 0.30118 0.0020043
## 5 0.015439      4   0.27997 0.28069 0.0018738
## 6 0.010000      5   0.26453 0.26563 0.0019048
plotcp(fit)

summary(fit) 
## Call:
## rpart(formula = label ~ ., data = train_val, method = "anova")
##   n= 166443 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.64332087      0 1.0000000 1.0000287 0.001117176
## 2 0.03572423      1 0.3566791 0.3574913 0.002190419
## 3 0.02056532      2 0.3209549 0.3214584 0.002155653
## 4 0.02042008      3 0.3003896 0.3011766 0.002004315
## 5 0.01543850      4 0.2799695 0.2806913 0.001873813
## 6 0.01000000      5 0.2645310 0.2656329 0.001904752
## 
## Variable importance
##         NDAI           SD         CORR     AN_angle     AF_angle 
##           25           18           15           15           13 
##     BF_angle x_coordinate     CF_angle     DF_angle 
##           12            1            1            1 
## 
## Node number 1: 166443 observations,    complexity param=0.6433209
##   mean=1.388908, MSE=0.2376585 
##   left son=2 (90542 obs) right son=3 (75901 obs)
##   Primary splits:
##       NDAI         < 0.7873264  to the left,  improve=0.6433209, (0 missing)
##       SD           < 3.102224   to the left,  improve=0.4642605, (0 missing)
##       CORR         < 0.197072   to the left,  improve=0.4462696, (0 missing)
##       x_coordinate < 229.5      to the right, improve=0.2655891, (0 missing)
##       AN_angle     < 176.503    to the right, improve=0.2638133, (0 missing)
##   Surrogate splits:
##       SD       < 3.117993   to the left,  agree=0.868, adj=0.710, (0 split)
##       CORR     < 0.1949221  to the left,  agree=0.806, adj=0.575, (0 split)
##       AN_angle < 181.8891   to the right, agree=0.782, adj=0.522, (0 split)
##       AF_angle < 186.8958   to the right, agree=0.762, adj=0.478, (0 split)
##       BF_angle < 244.978    to the right, agree=0.748, adj=0.447, (0 split)
## 
## Node number 2: 90542 observations,    complexity param=0.02042008
##   mean=1.030903, MSE=0.0299478 
##   left son=4 (89372 obs) right son=5 (1170 obs)
##   Primary splits:
##       AN_angle < 172.344    to the right, improve=0.2978938, (0 missing)
##       AF_angle < 177.9636   to the right, improve=0.1968573, (0 missing)
##       NDAI     < 0.3928416  to the left,  improve=0.1692084, (0 missing)
##       CORR     < 0.2076106  to the left,  improve=0.1624296, (0 missing)
##       BF_angle < 190.1277   to the right, improve=0.1449216, (0 missing)
##   Surrogate splits:
##       AF_angle     < 178.9546   to the right, agree=0.996, adj=0.674, (0 split)
##       BF_angle     < 189.9061   to the right, agree=0.992, adj=0.416, (0 split)
##       CF_angle     < 197.6788   to the right, agree=0.990, adj=0.211, (0 split)
##       DF_angle     < 209.3323   to the right, agree=0.989, adj=0.119, (0 split)
##       y_coordinate < 21.5       to the right, agree=0.988, adj=0.068, (0 split)
## 
## Node number 3: 75901 observations,    complexity param=0.03572423
##   mean=1.815971, MSE=0.1501625 
##   left son=6 (6966 obs) right son=7 (68935 obs)
##   Primary splits:
##       x_coordinate < 295.5      to the right, improve=0.12398610, (0 missing)
##       CORR         < 0.1898961  to the left,  improve=0.11638770, (0 missing)
##       y_coordinate < 45.5       to the left,  improve=0.07716196, (0 missing)
##       DF_angle     < 261.395    to the left,  improve=0.06285027, (0 missing)
##       SD           < 2.739462   to the left,  improve=0.05186801, (0 missing)
##   Surrogate splits:
##       AF_angle < 278.0661   to the right, agree=0.909, adj=0.003, (0 split)
##       AN_angle < 260.6509   to the right, agree=0.908, adj=0.003, (0 split)
##       BF_angle < 300.31     to the right, agree=0.908, adj=0.000, (0 split)
##       SD       < 0.5655767  to the left,  agree=0.908, adj=0.000, (0 split)
##       CORR     < -0.3280679 to the left,  agree=0.908, adj=0.000, (0 split)
## 
## Node number 4: 89372 observations
##   mean=1.020096, MSE=0.01969194 
## 
## Node number 5: 1170 observations
##   mean=1.85641, MSE=0.1229717 
## 
## Node number 6: 6966 observations,    complexity param=0.0154385
##   mean=1.386736, MSE=0.2371712 
##   left son=12 (4673 obs) right son=13 (2293 obs)
##   Primary splits:
##       AN_angle     < 196.6854   to the left,  improve=0.3696399, (0 missing)
##       AF_angle     < 210.1692   to the left,  improve=0.3273585, (0 missing)
##       BF_angle     < 231.8952   to the left,  improve=0.3106881, (0 missing)
##       y_coordinate < 69.5       to the left,  improve=0.2865935, (0 missing)
##       CF_angle     < 251.0203   to the left,  improve=0.2776899, (0 missing)
##   Surrogate splits:
##       AF_angle     < 210.0945   to the left,  agree=0.949, adj=0.844, (0 split)
##       BF_angle     < 231.8145   to the left,  agree=0.913, adj=0.736, (0 split)
##       CF_angle     < 250.7385   to the left,  agree=0.874, adj=0.616, (0 split)
##       y_coordinate < 77.5       to the left,  agree=0.853, adj=0.553, (0 split)
##       DF_angle     < 264.0044   to the left,  agree=0.846, adj=0.533, (0 split)
## 
## Node number 7: 68935 observations,    complexity param=0.02056532
##   mean=1.859346, MSE=0.1208706 
##   left son=14 (24527 obs) right son=15 (44408 obs)
##   Primary splits:
##       CORR     < 0.1976391  to the left,  improve=0.09763239, (0 missing)
##       SD       < 2.612428   to the left,  improve=0.04331985, (0 missing)
##       DF_angle < 293.1815   to the left,  improve=0.04170463, (0 missing)
##       AF_angle < 227.3604   to the right, improve=0.04085155, (0 missing)
##       NDAI     < 3.325084   to the right, improve=0.03524672, (0 missing)
##   Surrogate splits:
##       AF_angle     < 224.184    to the right, agree=0.718, adj=0.208, (0 split)
##       AN_angle     < 206.1113   to the right, agree=0.718, adj=0.206, (0 split)
##       SD           < 3.537287   to the left,  agree=0.705, adj=0.170, (0 split)
##       DF_angle     < 281.0187   to the left,  agree=0.691, adj=0.132, (0 split)
##       y_coordinate < 276.5      to the right, agree=0.686, adj=0.118, (0 split)
## 
## Node number 12: 4673 observations
##   mean=1.179328, MSE=0.1471695 
## 
## Node number 13: 2293 observations
##   mean=1.80942, MSE=0.1542593 
## 
## Node number 14: 24527 observations
##   mean=1.713173, MSE=0.2045572 
## 
## Node number 15: 44408 observations
##   mean=1.940078, MSE=0.05633103
rsq.rpart(fit)
## 
## Regression tree:
## rpart(formula = label ~ ., data = train_val, method = "anova")
## 
## Variables actually used in tree construction:
## [1] AN_angle     CORR         NDAI         x_coordinate
## 
## Root node error: 39557/166443 = 0.23766
## 
## n= 166443 
## 
##         CP nsplit rel error  xerror      xstd
## 1 0.643321      0   1.00000 1.00003 0.0011172
## 2 0.035724      1   0.35668 0.35749 0.0021904
## 3 0.020565      2   0.32095 0.32146 0.0021557
## 4 0.020420      3   0.30039 0.30118 0.0020043
## 5 0.015439      4   0.27997 0.28069 0.0018738
## 6 0.010000      5   0.26453 0.26563 0.0019048

Diagnostics 4b

#refer section 4d

Diagnostics 4c AdaBoost

# Long running time 
# Fit adaboost model
adaboost<- adaboost(label~y_coordinate + x_coordinate  + NDAI + SD + CORR + DF_angle + CF_angle+ BF_angle + AF_angle + AN_angle ,data=train_val, nIter=10)
# Adaboost model Accuracy 
mean(predict(adaboost, newdata =test_total[,-3] )$class == test_total$label)
## [1] 0.9977413
# Fit adaboost model without geographic information 
boost_no_xy<- boosting(label~NDAI + SD + CORR + DF_angle + CF_angle+ BF_angle + AF_angle + AN_angle ,data=train_val, mfinal = 10, boos=TRUE)

#Error based on number of trees 
errorevol(boost_no_xy,train_val[,c(-1,-2)])->evol.train
errorevol(boost_no_xy,test_total[,c(-1,-2)])->evol.test
plot.errorevol(evol.test,evol.train)

importanceplot(boost_no_xy)

# Margins
BC.margins<-margins(boost_no_xy,train_val[,c(-1,-2)]) # training set
BC.adaboost.pred <- predict.boosting(boost_no_xy,newdata=test_total[,c(-1,-2)])
 
BC.predmargins<-margins(BC.adaboost.pred,test_total[,c(-1,-2)]) # test set
plot.margins(BC.predmargins,BC.margins,alpha=0.3 )

# Adaboost model Accuracy without geographic information 
mean(predict(boost_no_xy, newdata =test_total[,-3] )$class == test_total$label)
## [1] 0.9340638
# Cut off point selection 
predictions<- data.frame(predict(adaboost, newdata =test_total[,-3] )$prob)
colnames(predictions) <- c(-1,1)
pred <- prediction(predictions$`1`, test_total$label)
perf <- performance(pred, "acc")
plot(perf, avg= "vertical", spread.estimate="boxplot", 
     show.spread.at= seq(0.1, 0.9, by=0.1), 
     main="Adaboosting cutoff and performance tradeoff")

index<-which.max(slot(perf, "y.values")[[1]])
max<-slot(perf, "x.values")[[1]][index]
# ROC curve
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize=T, print.cutoffs.at=c(max), text.adj=c(0.3,0), 
     avg="threshold", lwd=3, main= "Adaboosting curve for ROC")
abline(a=0,b=1)

Diagnostics 4d

#Misclassification pattern for random forest 
mis_label_random_forest<-train_data_rf[predicted.classes_rf!=test_total$label,]
ggplot(mis_label_random_forest)+
  geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate,
                              color = factor(label)))+
  xlab(" x coordinate") + ylab(" y coordinate") + 
  ggtitle("Mis-classification in random-forest model split method 1 ")+ 
  theme_bw()

ggplot(mis_label_random_forest)+geom_density(aes(x= CORR, group= label, 
                                      color=label, fill= label), 
                                  color= "black",alpha = 0.7)+
  ggtitle("RF Overlaying histogram of CORR split by block")+
  theme_minimal()

ggplot(mis_label_random_forest)+geom_density(aes(x= NDAI, group= label, 
                                      color=label, fill= label), 
                                  color= "black",alpha = 0.7)+
  ggtitle("RF Overlaying histogram of NDAI split by block")+
  theme_minimal()

ggplot(mis_label_random_forest)+geom_density(aes(x= SD, group= label, 
                                      color=label, fill= label), 
                                  color= "black",alpha = 0.7)+
  ggtitle("RF Overlaying histogram of SD split by block")+
  theme_minimal()

Predicted <-predicted.classes_rf
Actual<- test_total$label
fourfoldplot(table(Predicted, Actual))

#Method 2
mis_label_random_forest<- train_data_rf2[predicted.classes_rf_method_2!=test_total$label,]
ggplot(mis_label_random_forest)+ geom_point(alpha = 0.5, 
                                            aes(x= x_coordinate,y = y_coordinate,
                                                color = factor(label)))+
  xlab(" x coordinate") + ylab(" y coordinate")+
  ggtitle("Mis-classification in random-forest model split method 2 ")+theme_bw()

Predicted <-predicted.classes_rf_method_2
fourfoldplot(table(Predicted, Actual))

#Misclassification pattern for decision tree
mis_label_decision_tree<- train_data[tree.pred2!=test_total$label,]
ggplot(mis_label_decision_tree)+ geom_point(alpha = 0.5, 
                                            aes(x= x_coordinate, y = y_coordinate,
                                                color = factor(label)))+
  xlab(" x coordinate") + ylab(" y coordinate") + 
  ggtitle("Mis-classification in decision tree model split method 1 ")+ theme_bw()

Predicted <-tree.pred2
fourfoldplot(table(Predicted, Actual))

ggplot(mis_label_decision_tree)+geom_density(aes(x= CORR, group= label, 
                                      color=label, fill= label), 
                                  color= "red",alpha = 0.7)+
  ggtitle("DT Overlaying histogram of CORR split by block")+
  theme_minimal()

ggplot(mis_label_decision_tree)+geom_density(aes(x= NDAI, group= label, 
                                      color=label, fill= label), 
                                  color= "red",alpha = 0.7)+
  ggtitle("DT Overlaying histogram of NDAI split by block")+
  theme_minimal()

ggplot(mis_label_decision_tree)+geom_density(aes(x= SD, group= label, 
                                      color=label, fill= label), 
                                  color= "red",alpha = 0.7)+
  ggtitle("DT Overlaying histogram of SD split by block")+
  theme_minimal()

#Method 2
mis_label_decision_tree2<- train_data_tree_2[tree.pred_method_2!=test_total$label,]
ggplot(mis_label_decision_tree2)+ geom_point(alpha = 0.5, aes(x= x_coordinate, y = y_coordinate, color = factor(label)))+
  xlab(" x coordinate") + ylab(" y coordinate") + 
  ggtitle("Mis-classification in decision tree model split method 2 ")+ theme_bw()

Predicted <-tree.pred_method_2
fourfoldplot(table(Predicted, Actual))